{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ur8xi4C7S06n"
      },
      "outputs": [],
      "source": [
        "# Copyright 2025 Google LLC\n",
        "#\n",
        "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "#     https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JAPoU8Sm5E6e"
      },
      "source": [
        "# Evaluate groundedness with custom parsing"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "<table align=\"left\">\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_groundedness_with_custom_parsing.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg\" alt=\"Google Colaboratory logo\"><br> Open in Colab\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fevaltask_approach%2Fevaluate_groundedness_with_custom_parsing.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN\" alt=\"Google Cloud Colab Enterprise logo\"><br> Open in Colab Enterprise\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/evaltask_approach/evaluate_groundedness_with_custom_parsing.ipynb\">\n",
        "      <img src=\"https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg\" alt=\"Vertex AI logo\"><br> Open in Vertex AI Workbench\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_groundedness_with_custom_parsing.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://raw.githubusercontent.com/primer/octicons/refs/heads/main/icons/mark-github-24.svg\" alt=\"GitHub logo\"><br> View on GitHub\n",
        "    </a>\n",
        "  </td>\n",
        "</table>\n",
        "\n",
        "<div style=\"clear: both;\"></div>\n",
        "\n",
        "<b>Share to:</b>\n",
        "\n",
        "<a href=\"https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_groundedness_with_custom_parsing.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg\" alt=\"LinkedIn logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_groundedness_with_custom_parsing.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg\" alt=\"Bluesky logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_groundedness_with_custom_parsing.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg\" alt=\"X logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_groundedness_with_custom_parsing.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png\" alt=\"Reddit logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaltask_approach/evaluate_groundedness_with_custom_parsing.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg\" alt=\"Facebook logo\">\n",
        "</a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UBAgFfnqvxy0"
      },
      "source": [
        " | | |\n",
        " |-|-|\n",
        " |Author(s): | [Greg Breard](https://github.com/gregbreard) |"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tvgnzT1CKxrO"
      },
      "source": [
        "## Overview\n",
        "\n",
        "This Colab shows how to assess the grounding capabilities of generative models, evaluating their ability to generate responses that are factually consistent with and derived from a given textual context. Assessing grounding with custom parsing, as shown here, offers a more advanced evaluation compared to the standard [metric prompt templates](https://cloud.google.com/vertex-ai/generative-ai/docs/models/metrics-templates), allowing for detailed analysis of factual consistency and source relevance.\n",
        "\n",
        "It goes beyond basic information retrieval by examining the extent to which model outputs are truly rooted in the provided source and can synthesize information accurately from that context. The approach to grounding demonstrated here is based on the [FACTS Grounding Benchmark](https://www.kaggle.com/facts-leaderboard/examples).\n",
        "\n",
        "This is accomplished using the Vertex Gen AI Evaluation SDK which supports custom output parsing. The prompt instructs the autorater model to return a structured (JSON) output which is then parsed seemlessly by providing the parsing method in the metric definition. The parsed output is appended to the evaluation result data frame.\n",
        "\n",
        "## Objective\n",
        "\n",
        "1. Generate structured output from an autorater.\n",
        "2. Use custom parsing for advanced evalutation.\n",
        "\n",
        "## Steps\n",
        "\n",
        "1. Set up the environment.\n",
        "2. Define helper functions, prompt templates, and metric.\n",
        "3. Prepare the dataset for evaluation.\n",
        "4. Run the evaluation (including model inference).\n",
        "\n",
        "## Costs\n",
        "This tutorial uses billable components of Google Cloud:\n",
        "\n",
        "- Vertex AI\n",
        "\n",
        "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "61RBz8LLbxCR"
      },
      "source": [
        "# Get started"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "No17Cw5hgx12"
      },
      "source": [
        "## Install Vertex AI SDK for Python and other required packages\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tFy3H3aPgx12"
      },
      "outputs": [],
      "source": [
        "%pip install --upgrade --quiet google-cloud-aiplatform"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "R5Xep4W9lq-Z"
      },
      "source": [
        "## Restart runtime (Colab only)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NTGpLRykv37-"
      },
      "source": [
        "To use the newly installed packages, you must restart the runtime on Google Colab."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "XRvKdaPDTznN"
      },
      "outputs": [],
      "source": [
        "import sys\n",
        "\n",
        "if \"google.colab\" in sys.modules:\n",
        "\n",
        "    import IPython\n",
        "\n",
        "    app = IPython.Application.instance()\n",
        "    app.kernel.do_shutdown(True)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SbmM4z7FOBpM"
      },
      "source": [
        "<div class=\"alert alert-block alert-warning\">\n",
        "<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>\n",
        "</div>\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "dmWOrTJ3gx13"
      },
      "source": [
        "## Authenticate your notebook environment (Colab only)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7pAMQL6bwGKg"
      },
      "source": [
        "Authenticate your environment on Google Colab."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NyKGtVQjgx13"
      },
      "outputs": [],
      "source": [
        "import sys\n",
        "\n",
        "if \"google.colab\" in sys.modules:\n",
        "\n",
        "    from google.colab import auth\n",
        "\n",
        "    auth.authenticate_user()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DF4l8DTdWgPY"
      },
      "source": [
        "## Set Google Cloud project information and initialize Vertex AI SDK for Python"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xdZEC-KuwKOM"
      },
      "source": [
        "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Nqwi-5ufWp_B"
      },
      "outputs": [],
      "source": [
        "PROJECT_ID = \"your-project-id\"  # @param {type:\"string\"}\n",
        "LOCATION = \"us-central1\"  # @param {type:\"string\"}\n",
        "\n",
        "\n",
        "import vertexai\n",
        "\n",
        "vertexai.init(project=PROJECT_ID, location=LOCATION)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "EdvJRUWRNGHE"
      },
      "source": [
        "## Import libraries"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "R0peQPa48oTg"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "from vertexai.generative_models import GenerativeModel\n",
        "from vertexai.preview.evaluation import (\n",
        "    AutoraterConfig,\n",
        "    CustomOutputConfig,\n",
        "    EvalTask,\n",
        "    PointwiseMetric,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UwS2qQaxNTyy"
      },
      "source": [
        "# Set up evaluation"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "e-TDamAG9mkM"
      },
      "source": [
        "## Helper functions"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1pjVIACIMYne"
      },
      "source": [
        "The following functions provide support for extracting JSON objects from the results returned by the autorater and computing the model response score. Additionally, there are pretty printing methods to improve the readability of the evaluation results."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GfLmFxkP-Kvr"
      },
      "outputs": [],
      "source": [
        "import json\n",
        "import re\n",
        "from typing import Any\n",
        "\n",
        "_TABLE_STYLE = [\n",
        "    {\n",
        "        \"selector\": \"th\",\n",
        "        \"props\": [\n",
        "            (\"background-color\", \"#f2f2f2\"),\n",
        "            (\"border\", \"1px solid gray\"),\n",
        "            (\"color\", \"black\"),\n",
        "            (\"font-size\", \"11pt\"),\n",
        "            (\"text-align\", \"center\"),\n",
        "            (\"word-break\", \"break-all\"),\n",
        "        ],\n",
        "    },\n",
        "    {\n",
        "        \"selector\": \"td\",\n",
        "        \"props\": [\n",
        "            (\"border\", \"1px solid gray\"),\n",
        "            (\"color\", \"black\"),\n",
        "            (\"min-width\", \"100px\"),\n",
        "            (\"text-align\", \"center\"),\n",
        "        ],\n",
        "    },\n",
        "    {\"selector\": \"tr:nth-child(even)\", \"props\": [(\"background-color\", \"#f9f9f9\")]},\n",
        "    {\"selector\": \"tr:nth-child(odd)\", \"props\": [(\"background-color\", \"white\")]},\n",
        "    {\"selector\": \"tr:hover\", \"props\": [(\"background-color\", \"#94e6ff\")]},\n",
        "    {\"selector\": \"td:hover\", \"props\": [(\"background-color\", \"#ffffb3\")]},\n",
        "]\n",
        "\n",
        "\n",
        "def parse_response_to_json(responses: list[str]) -> dict[str, Any]:\n",
        "    response = re.sub(\n",
        "        r\"(.*```json|```.*)\",\n",
        "        \"\",\n",
        "        responses[0].strip(),\n",
        "    )\n",
        "    sentences = []\n",
        "    verdicts = []\n",
        "    score = 0\n",
        "    try:\n",
        "        sentences = json.loads(response)\n",
        "        for sentence in sentences:\n",
        "            verdicts.append(sentence[\"label\"])\n",
        "        successful = 0\n",
        "        for verdict in verdicts:\n",
        "            if verdict == \"supported\":\n",
        "                successful += 1\n",
        "            elif verdict in [\"unsupported\", \"contradictory\"]:\n",
        "                successful = 0\n",
        "                break\n",
        "        score = successful / len(verdicts)\n",
        "    except Exception as e:\n",
        "        print(f\"Failed to parse JSON response: {str(e)}\")\n",
        "    return {\n",
        "        \"sentence\": sentences,\n",
        "        \"sentence_verdict\": verdicts,\n",
        "        \"model_resp_score\": score,\n",
        "    }\n",
        "\n",
        "\n",
        "def pretty_print_df(df: \"pd.DataFrame\", hide_columns: list[str]) -> \"pd.Styler\":\n",
        "    styled_df = df.copy()\n",
        "    for col in df.columns:\n",
        "        if (\n",
        "            isinstance(df[col][0], list)\n",
        "            and df[col][0]\n",
        "            and isinstance(df[col][0][0], dict)\n",
        "        ):\n",
        "            styled_df[col] = styled_df[col].apply(lambda x: _list_to_html_table(x))\n",
        "    return (\n",
        "        styled_df.style.hide(axis=\"index\")\n",
        "        .hide(subset=hide_columns, axis=1)\n",
        "        .format({\"groundedness/model_resp_score\": \"{:,.1f}\"})\n",
        "        .set_table_styles(_TABLE_STYLE)\n",
        "    )\n",
        "\n",
        "\n",
        "def _list_to_html_table(data: list[dict[str, Any]]) -> str:\n",
        "    if not data:\n",
        "        return \"<i>No data to display.</i>\"\n",
        "    html_table = \"<table style='border-collapse: collapse'><thead><tr>\"\n",
        "    # Extract headers from the first element\n",
        "    for key in data[0].keys():\n",
        "        html_table += f\"<th>{key}</th>\"\n",
        "    html_table += \"</tr></thead><tbody>\"\n",
        "    # Add rows\n",
        "    for item in data:\n",
        "        html_table += \"<tr>\"\n",
        "        for value in item.values():\n",
        "            html_table += f\"<td>{value}</td>\"\n",
        "        html_table += \"</tr>\"\n",
        "    html_table += \"</tbody></table>\"\n",
        "    return html_table"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tLn3FteP9ttL"
      },
      "source": [
        "## Prompt Templates"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5--M9Kcz-LFt"
      },
      "outputs": [],
      "source": [
        "GROUNDING_AUTORATER_PROMPT = \"\"\"\n",
        "You are a helpful and harmless AI assistant. You will be provided with a textual\n",
        "context and a model-generated response.\n",
        "Your task is to analyze the response sentence by sentence and classify each\n",
        "sentence according to its relationship with the provided context.\n",
        "\n",
        "**Instructions:**\n",
        "\n",
        "1. **Decompose the response into individual sentences.**\n",
        "2. **For each sentence, assign one of the following labels:**\n",
        "    * **`supported`**: The sentence is entailed by the given context.  Provide a\n",
        "      supporting excerpt from the context. The supporting excerpt must *fully*\n",
        "      entail the sentence. If you need to cite multiple supporting excerpts,\n",
        "      simply concatenate them.\n",
        "    * **`unsupported`**: The sentence is not entailed by the given context. No\n",
        "      excerpt is needed for this label.\n",
        "    * **`contradictory`**: The sentence is falsified by the given context.\n",
        "      Provide a contradicting excerpt from the context.\n",
        "    * **`no_rad`**: The sentence does not require factual attribution (e.g.,\n",
        "      opinions, greetings, questions, disclaimers).  No excerpt is needed for\n",
        "      this label.\n",
        "3. **For each label, provide a short rationale explaining your decision.**\n",
        "   The rationale should be separate from the excerpt.\n",
        "4. **Be very strict with your `supported` and `contradictory` decisions.**\n",
        "   Unless you can find straightforward, indisputable evidence excerpts *in the\n",
        "   context* that a sentence is `supported` or `contradictory`, consider it\n",
        "   `unsupported`. You should not employ world knowledge unless it is truly\n",
        "   trivial.\n",
        "\n",
        "**Input Format:**\n",
        "\n",
        "The input will consist of two parts, clearly separated:\n",
        "\n",
        "* **Context:**  The textual context used to generate the response.\n",
        "* **Response:** The model-generated response to be analyzed.\n",
        "\n",
        "**Output Format:**\n",
        "\n",
        "For each sentence in the response, output a JSON object with the following\n",
        "fields:\n",
        "\n",
        "* `\"sentence\"`: The sentence being analyzed.\n",
        "* `\"label\"`: One of `supported`, `unsupported`, `contradictory`, or `no_rad`.\n",
        "* `\"rationale\"`: A brief explanation for the assigned label.\n",
        "* `\"excerpt\"`:  A relevant excerpt from the context. Only required for\n",
        "  `supported` and `contradictory` labels.\n",
        "\n",
        "Output each JSON object on a new line.\n",
        "\n",
        "**Example:**\n",
        "\n",
        "**Input:**\n",
        "\n",
        "```\n",
        "Context:\n",
        "Apples are red fruits. Bananas are yellow fruits.\n",
        "\n",
        "Response:\n",
        "Apples are red. Bananas are green. Bananas are cheaper than apples. Enjoy your fruit!\n",
        "```\n",
        "\n",
        "**Output:**\n",
        "\n",
        "[\n",
        "{\"sentence\": \"Apples are red.\", \"label\": \"supported\", \"rationale\": \"The context explicitly states that apples are red.\", \"excerpt\": \"Apples are red fruits.\"}\n",
        "{\"sentence\": \"Bananas are green.\", \"label\": \"contradictory\", \"rationale\": \"The context states that bananas are yellow, not green.\", \"excerpt\": \"Bananas are yellow fruits.\"}\n",
        "{\"sentence\": \"Bananas are cheaper than apples.\", \"label\": \"unsupported\", \"rationale\": \"The context does not mention the price of bananas or apples.\", \"excerpt\": null}\n",
        "{\"sentence\": \"Enjoy your fruit!\", \"label\": \"no_rad\", \"rationale\": \"This is a general expression and does not require factual attribution.\", \"excerpt\": null}\n",
        "]\n",
        "\n",
        "**Now, please analyze the following context and response:**\n",
        "\n",
        "**User Query:**\n",
        "{query}\n",
        "\n",
        "**Context:**\n",
        "{context}\n",
        "\n",
        "**Response:**\n",
        "{response}\n",
        "\"\"\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GqhaUNqXqcvE"
      },
      "outputs": [],
      "source": [
        "RESPONSE_PROMPT_TEMPLATE = \"\"\"\n",
        "Using only the information included in the context block, answer the user query\n",
        "in 5 sentences or less.\n",
        "\n",
        "**User Query:**\n",
        "{query}\n",
        "\n",
        "**Context:**\n",
        "{context}\n",
        "\"\"\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Fdtcd-WF9ymb"
      },
      "source": [
        "## Define the metric"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CeA0mrWi-LX1"
      },
      "outputs": [],
      "source": [
        "grounded_metric = PointwiseMetric(\n",
        "    metric=\"groundedness\",\n",
        "    metric_prompt_template=GROUNDING_AUTORATER_PROMPT,\n",
        "    custom_output_config=CustomOutputConfig(\n",
        "        return_raw_output=True,\n",
        "        parsing_fn=parse_response_to_json,\n",
        "    ),\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nHjCRree99Px"
      },
      "source": [
        "# Prepare the dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "U37FNLD8-Lp5"
      },
      "outputs": [],
      "source": [
        "# source: https://www.kaggle.com/datasets/deepmind/facts-grounding-examples/data?select=examples.csv\n",
        "queries = [\n",
        "    \"What advantages do offline vs online retailers have?\",\n",
        "    \"What are the requirements of OPM?\",\n",
        "    \"How does virtual memory improve the efficiency of real physical memory (RAM) usage in computer systems?\",\n",
        "    \"List the reasons that resulted in decreased emission of greenhouse gases from ethanol production.\",\n",
        "]\n",
        "contexts = [\n",
        "    \"\"\"Due to savings in inventory costs, online retailing has a big advantage in selling\n",
        "less popular items (the long tail) and in removing geographic barriers to purchase. Brynjolfsson\n",
        "et al. (2003) estimated that the significantly increased assortment of books available online\n",
        "increased consumer welfare by $700 million to a billion dollars in 2000. In comparing the online\n",
        "sales of a clothing retailer with the catalog sales of the same item, Brynjolfsson et al. (2009)\n",
        "showed that online sales of niche items were less sensitive to competition from offline stores\n",
        "than from catalog sales because the online sales were skewed toward niche items. Brynjolfsson et\n",
        "al. (2011) showed that online sales of niche items increased with recommendations and search tools,\n",
        "indicating that these tools lowered search costs, making it easier for consumers to locate them.\n",
        "In sum, because of the ability to handle more extensive inventories and provide search tools that\n",
        "facilitate locating niche items, online retailing has a comparative advantage in selling less popular\n",
        "items, translating into substantial benefits for consumers.\n",
        "As noted above, online sellers have an advantage in facilitating a search for information on digital\n",
        "attributes (including price). In contrast, offline sellers have an advantage in providing information\n",
        "on non- digital attributes and providing faster delivery. This leads to the possibility that consumers\n",
        "will search among both online and offline retailers.\"\"\",\n",
        "    \"\"\"As part of the assessment, S. 4043 would require OPM to explain whether each agency met its telework\n",
        "goals and, if not, the actions being taken to identify and eliminate barriers to meeting them. The annual\n",
        "report would also discuss additional steps that are planned by agencies to ensure telework oversight and\n",
        "quality control and increase the utilization rates of office building space owned or leased by the agencies.\n",
        "S. 4043 also requires the Office of Management and Budget (OMB), in consultation with GSA and the\n",
        "Federal Real Property Council, to develop benchmarks and guidance for executive agencies to use when\n",
        "calculating building utilization rates. S. 4043 would then require each executive agency head to establish\n",
        "(1) a system to track office building space utilization rates consistent with that OMB guidance and (2)\n",
        "indicators that measure the effects of telework policy on the management of real and personal property,\n",
        "among other things.\n",
        "S. 4043 would also require OPM to establish data standards to aid telework reporting requirements and\n",
        "for automated telework tracking within payroll systems used by agencies. S. 4043 would require OPM, in\n",
        "turn, to create an online tool that makes the standardized and reported data publicly available and would\n",
        "allow OPM to use the online tool to fulfill its annual reporting requirements. For a more detailed\n",
        "discussion of the bill's provisions on telework data standards, including office building utilization data,\n",
        "see CRS Insight IN12352, Establishing Data Standards and Measuring Building Use: Select Provisions\n",
        "of the Telework Transparency Act of 2024 (S. 4043).\"\"\",\n",
        "    \"\"\"Virtual memory is a computer system technique which gives an application program\n",
        "the impression that it has contiguous working memory (an address space), while in fact it may be\n",
        "physically fragmented and may even overflow on to disk storage. Systems that use this technique\n",
        "make programming of large applications easier and use real physical memory (e.g.RAM) more\n",
        "efficiently than those without virtual memory.\n",
        "http://en.wikipedia.org/wiki/Virtual_memory\n",
        "Page Fault: A page is a fixed-length block of memory that is used as a unit of transfer\n",
        "between physical memory and external storage like a disk, and a page fault is an interrupt (or\n",
        "exception) to the software raised by the hardware, when a program accesses a page that is\n",
        "mapped in address space, but not loaded in physical memory.\n",
        "http://en.wikipedia.org/wiki/Page_fault\n",
        "Thrash is the term used to describe a degenerate situation on a computer where increasing\n",
        "resources are used to do a decreasing amount of work. In this situation the system is\n",
        "said to be thrashing. Usually it refers to two or more processes accessing a shared resource\n",
        "repeatedly such that serious system performance degradation occurs because the system is\n",
        "spending a disproportionate amount of time just accessing the shared resource. Resource\n",
        "access time may generally be considered as wasted, since it does not contribute to the\n",
        "advancement of any process. In modern computers, thrashing may occur in the paging system\n",
        "(if there is not 'sufficient' physical memory or the disk access time is overly long), or in the\n",
        "communications system (especially in conflicts over internal bus access), etc.\n",
        "http://en.wikipedia.org/wiki/Thrash_(computer_science)\"\"\",\n",
        "    \"\"\"A new USDA report, titled 'A Life-Cycle Analysis of the Greenhouse Gas Emissions of Corn-Based\n",
        "Ethanol,' finds that greenhouse gas emissions associated with producing corn-based ethanol in\n",
        "the United States are about 43 percent lower than gasoline when measured on an energy equivalent\n",
        "basis. Unlike other studies of greenhouse gas benefits, which relied on forecasts of future ethanol production\n",
        "systems and expected impacts on the farm sector, this study reviewed how the industry and farm\n",
        "sectors have performed over the past decade to assess the current greenhouse gas profile of corn-based ethanol.\n",
        "The report shows that the reductions in greenhouse gas emissions were driven by a variety of improvements in\n",
        "ethanol production, spanning from the corn field to the ethanol refinery. Farmers are producing corn\n",
        "more efficiently and using conservation practices that reduce greenhouse gas emissions, including reduced tillage,\n",
        "cover crops, and improved nitrogen management. Both corn yields and the efficiency of ethanol\n",
        "production technologies are also improving.\n",
        "Previous estimates of ethanol's greenhouse gas balance report lower efficiencies, largely due to anticipated\n",
        "conversion of grasslands and forests to commodity production as a result of increased demand for corn\n",
        "used in ethanol production. However, recent studies of international agricultural land use trends show\n",
        "that since 2004, the primary land use change response of the world's farmers to rising commodity prices\n",
        "has been to use available land resources more efficiently rather than to expand the amount of land used\n",
        "for farming.\"\"\",\n",
        "]\n",
        "\n",
        "eval_dataset = pd.DataFrame(\n",
        "    {\n",
        "        \"query\": queries,\n",
        "        \"context\": contexts,\n",
        "    }\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qm6mHoPc-DS0"
      },
      "source": [
        "# Run evaluation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mMuDxiwy-MTt"
      },
      "outputs": [],
      "source": [
        "eval_task = EvalTask(\n",
        "    dataset=eval_dataset,\n",
        "    metrics=[grounded_metric],\n",
        "    autorater_config=AutoraterConfig(sampling_count=1),\n",
        ")\n",
        "\n",
        "# Model to use for generating responses to evaluate.\n",
        "eval_model = GenerativeModel(model_name=\"gemini-2.0-flash-001\")\n",
        "\n",
        "eval_result = eval_task.evaluate(\n",
        "    model=eval_model,\n",
        "    prompt_template=RESPONSE_PROMPT_TEMPLATE,\n",
        ")\n",
        "\n",
        "# Calculate overall score for metric.\n",
        "np.mean(eval_result.metrics_table[\"groundedness/model_resp_score\"])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "e93FLC-xjtZU"
      },
      "outputs": [],
      "source": [
        "pretty_print_df(\n",
        "    eval_result.metrics_table, hide_columns=[\"prompt\", \"groundedness/sentence_verdict\"]\n",
        ")"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [
        "dmWOrTJ3gx13",
        "EdvJRUWRNGHE",
        "tLn3FteP9ttL",
        "nHjCRree99Px"
      ],
      "name": "evaluate_groundedness_with_custom_parsing.ipynb",
      "toc_visible": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
